Introduction
Analysis Questions
Question 1
# Load required Libraries
library(ggplot2)
library(plyr)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## <U+2713> tibble 2.1.3 <U+2713> dplyr 0.8.3
## <U+2713> tidyr 1.0.0 <U+2713> stringr 1.4.0
## <U+2713> readr 1.3.1 <U+2713> forcats 0.4.0
## <U+2713> purrr 0.3.3
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(e1071)
# Read in brewery data; blank fields are treated as missing
brewery_data = read.csv('C:/Users/Tpeng/OneDrive/Documents/SMU/Doing Data Science/Unit 8 Project/Breweries.csv', header = TRUE, na.strings = c('', 'NA'))
# Tally breweries per state; naming the table dimension gives the
# "State"/"Freq" columns directly, no rename step needed
state_counts = as.data.frame(table(State = brewery_data$State))
# Sort ascending by count (reorder() below orders the plot axis as well)
state_counts = state_counts[order(state_counts$Freq), ]
# Horizontal bar chart of brewery counts, states ordered by frequency
ggplot(state_counts, aes(x = reorder(State, Freq), y = Freq, fill = State)) +
  geom_col() +
  coord_flip() +
  ggtitle('Number of Breweries by State') +
  ylab('Count') +
  xlab('State')
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
## The following object is masked from 'package:plyr':
##
## ozone
library(dplyr)
library(mapproj)
# Create dataframe with state information and change column names to merge
new_brewery_data = brewery_data
lookup = data.frame(abb = state.abb, State = state.name)
colnames(new_brewery_data)[4] = "abb"
brewery_data2 = merge(new_brewery_data, lookup, by = "abb", all.x = TRUE, no.dups = F)
# Drop the District of Columbia (it has no entry in state.name); note the
# leading space -- the raw abbreviations are stored as ' DC', ' MN', etc.
brewery_data2 = brewery_data2[!brewery_data2$abb == ' DC',]
levels(brewery_data2$State) = levels(lookup$State)
levels(lookup$abb) = levels(brewery_data2$abb)
# Fill in full state names in one vectorized lookup instead of a per-row
# which() loop: match() finds each trimmed abbreviation's position in
# state.abb. Assigning with [] keeps the column's original class (every
# assigned value is an existing factor level).
brewery_data2$State[] = state.name[match(trimws(as.character(brewery_data2$abb)), state.abb)]
# Count breweries in each state and manipulate variables to match states dataframe
NumBreweryMapData = count(brewery_data2, State)
colnames(NumBreweryMapData)[2] = "Breweries"
NumBreweryMapData$region <- tolower(NumBreweryMapData$State)
NumBreweryMapData2 = NumBreweryMapData[-1]
states <- map_data("state")
map.df <- merge(states, NumBreweryMapData2, by = "region", all.x = T)
map.df <- map.df[order(map.df$order),]
# Rough centroid of each state (midpoint of the lat/long bounding box),
# used to place the brewery-count labels.
# Renamed from 'tibble' -- that name shadowed the tibble() constructor.
state_mids = map.df %>% group_by(region) %>% summarise(midlat = mean(c(max(lat), min(lat))), midlong = mean(c(max(long), min(long))))
# Vectorized lookup of each row's state midpoint (replaces a per-row loop)
mid_idx = match(map.df$region, state_mids$region)
map.df$midlat = state_mids$midlat[mid_idx]
map.df$midlong = state_mids$midlong[mid_idx]
# Choropleth of brewery counts with the count printed at each state centroid
ggplot(map.df, aes(x=long,y=lat,group=group)) +
  geom_polygon(aes(fill=Breweries)) +
  geom_path() +
  scale_fill_gradientn(colours=rev(heat.colors(10)),na.value="grey90") +
  ggtitle("Number of Breweries in Each State") +
  coord_map() +
  geom_text(label = map.df$Breweries, x = map.df$midlong, y = map.df$midlat) +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 10 rows containing missing values (geom_text).
Question 2
# Read in Beer data and merge with brewery data
beer_data = read.csv('C:/Users/Tpeng/OneDrive/Documents/SMU/Doing Data Science/Unit 8 Project/Beers.csv', header = TRUE, na.strings = c('', 'NA'))
# Full outer join on brewery id: keeps breweries with no beers and any
# beers whose brewery id has no match
dataset = merge(brewery_data, beer_data, by.x = 'Brew_ID', by.y = 'Brewery_id', all = TRUE)
# Print first and last 6 observations
dataset[1:6,]
## Brew_ID Name.x City State Name.y Beer_ID ABV IBU
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060 38
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060 25
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056 47
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045 50
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049 26
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048 19
## Style Ounces
## 1 Pumpkin Ale 16
## 2 American Porter 16
## 3 Extra Special / Strong Bitter (ESB) 16
## 4 American IPA 16
## 5 Milk / Sweet Stout 16
## 6 English Brown Ale 16
# Last 6 observations of the merged dataset
tail(dataset, 6)
## Brew_ID Name.x City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## Name.y Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
Question 3
# Remove beers missing values for Style
dataset = dataset[!is.na(dataset$Style),]
# Calculate average values for ABV and IBU by style and store as data frames
avg_abv = dataset %>% group_by(Style) %>% summarize(meanABV = mean(ABV, na.rm = TRUE))
avg_abv = as.data.frame(avg_abv)
grand_mean_abv = dataset %>% summarize(grand_mean_abv = mean(ABV, na.rm = TRUE))
grand_mean_ibu = dataset %>% summarize(grand_mean_ibu = mean(IBU, na.rm = TRUE))
avg_ibu = dataset %>% group_by(Style) %>% summarize(meanibu = mean(IBU, na.rm = TRUE))
avg_ibu = as.data.frame(avg_ibu)
# Replace missing per-style average IBU's with the grand mean IBU.
# BUG FIX: grand_mean_ibu is a 1x1 data frame; passing it directly to
# ifelse() turned meanibu into a list. Use the numeric value instead.
avg_ibu$meanibu = ifelse(is.na(avg_ibu$meanibu), grand_mean_ibu$grand_mean_ibu, avg_ibu$meanibu)
# Replace missing ABV's and IBU's in the full dataset with the style means.
# Vectorized via match() (takes the first matching style, exactly like the
# old per-row loop did).
abv_fill = avg_abv$meanABV[match(dataset$Style, avg_abv$Style)]
dataset$ABV = ifelse(is.na(dataset$ABV), abv_fill, dataset$ABV)
ibu_fill = avg_ibu$meanibu[match(dataset$Style, avg_ibu$Style)]
dataset$IBU = ifelse(is.na(dataset$IBU), ibu_fill, dataset$IBU)
Question 4
# Find and display median values of ABV and IBU per state
median_abv = dataset %>% group_by(State) %>% summarize(medianABV = median(ABV))
median_abv = as.data.frame(median_abv)
median_ibu = dataset %>% group_by(State) %>% summarize(medianIBU = median(IBU))
median_ibu = as.data.frame(median_ibu)
# Determine which observation has maximum values and output the state
median_abv$State[which.max(median_abv$medianABV)] # Kentucky
## [1] KY
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL ... WY
median_abv$medianABV[which.max(median_abv$medianABV)]
## [1] 0.06452758
median_ibu$State[which.max(median_ibu$medianIBU)] # Delaware
## [1] DE
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL ... WY
median_ibu$medianIBU[which.max(median_ibu$medianIBU)]
## [1] 59.81728
# One table with both medians per state, used for the maps below
median_data = merge(median_abv, median_ibu, by = 'State')
# Lollipop chart of median ABV by state.
# FIXES: reorder()'s argument is FUN (capitalized) -- the lowercase 'fun'
# was silently swallowed by '...'; and columns inside aes() should be bare
# names, not df$col, so they are evaluated against the plot's own data.
median_abv %>% ggplot(aes(x = reorder(State, medianABV, FUN = median), y = medianABV)) +
  geom_point(col = 'tomato2', size = 2) +
  geom_segment(aes(x = State,
                   xend = State,
                   y = min(medianABV),
                   yend = max(medianABV)),
               linetype = 'dashed',
               size = 0.1) +
  coord_flip() +
  ggtitle('Median ABV By State') +
  xlab('State') + ylab('Median ABV') +
  theme(plot.title = element_text(hjust = 0.5))
# Same chart for median IBU
median_ibu %>% ggplot(aes(x = reorder(State, medianIBU, FUN = median), y = medianIBU)) +
  geom_point(col = 'tomato2', size = 2) +
  geom_segment(aes(x = State,
                   xend = State,
                   y = min(medianIBU),
                   yend = max(medianIBU)),
               linetype = 'dashed',
               size = 0.1) +
  coord_flip() +
  ggtitle('Median IBU By State') +
  xlab('State') + ylab('Median IBU') +
  theme(plot.title = element_text(hjust = 0.5))
library(ggplot2)
library(maps)
library(dplyr)
library(mapproj)
# Create dataframe with state information and change column names to merge
lookup = data.frame(abb = state.abb, State = state.name)
colnames(median_data)[1] = "abb"
median_data2 = merge(median_data, lookup, by = "abb", all.x = TRUE, no.dups = F)
# Drop DC (leading space in the raw abbreviation; no entry in state.name)
median_data2 = median_data2[!median_data2$abb == ' DC',]
levels(median_data2$State) = levels(lookup$State)
levels(lookup$abb) = levels(median_data2$abb)
# Vectorized abbreviation -> state-name lookup (replaces a per-row loop);
# [] assignment keeps the column's original class
median_data2$State[] = state.name[match(trimws(as.character(median_data2$abb)), state.abb)]
median_data2$region <- tolower(median_data2$State)
states <- map_data("state")
map.df <- merge(states, median_data2, by = "region", all.x = T)
map.df <- map.df[order(map.df$order),]
# Express ABV as a percentage for the map legend
map.df$Percent = map.df$medianABV * 100
# Heat map of median ABV per state
ggplot(map.df, aes(x=long,y=lat,group=group)) +
  geom_polygon(aes(fill=Percent)) +
  geom_path() +
  scale_fill_gradientn(colours=rev(rainbow(10)),na.value="grey90") +
  ggtitle("Median ABV in Each State") +
  coord_map() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = 'ABV (%)')
# Heat map of median IBU per state
ggplot(map.df, aes(x=long,y=lat,group=group)) +
  geom_polygon(aes(fill=medianIBU)) +
  geom_path() +
  scale_fill_gradientn(colours=rev(rainbow(10)),na.value="grey90") +
  ggtitle("Median IBU in Each State") +
  coord_map() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = 'IBU Rating')
# Utah has much stricter laws regulating alcohol due to the high concentration of Mormons and their beliefs regarding alcohol.
Question 5
# State containing the single highest-ABV beer in the dataset
state_maxABV = dataset$State[which.max(dataset$ABV)]
# State containing the single most bitter (highest-IBU) beer
state_maxIBU = dataset$State[which.max(dataset$IBU)]
state_maxABV # Colorado
## [1] CO
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL ... WY
dataset$ABV[which.max(dataset$ABV)]
## [1] 0.128
state_maxIBU # Oregon
## [1] OR
## 51 Levels: AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL ... WY
dataset$IBU[which.max(dataset$IBU)]
## [1] 138
# Per-state maxima of ABV and IBU, merged into one table for the maps below
max_abv = dataset %>% group_by(State) %>% summarize(maxABV = max(ABV))
max_abv = as.data.frame(max_abv)
max_ibu = dataset %>% group_by(State) %>% summarize(maxIBU = max(IBU))
max_ibu = as.data.frame(max_ibu)
max_data = merge(max_abv, max_ibu, by = 'State')
# Lollipop chart of max ABV by state.
# FIXES: reorder()'s argument is FUN ('fun' was silently ignored), and
# columns inside aes() should be bare names, not df$col.
max_abv %>% ggplot(aes(x = reorder(State, maxABV, FUN = median), y = maxABV)) +
  geom_point(col = 'tomato2', size = 2) +
  geom_segment(aes(x = State,
                   xend = State,
                   y = min(maxABV),
                   yend = max(maxABV)),
               linetype = 'dashed',
               size = 0.1) +
  coord_flip() +
  ggtitle('Max ABV By State') +
  xlab('State') + ylab('Max ABV') +
  theme(plot.title = element_text(hjust = 0.5))
# Same chart for max IBU
max_ibu %>% ggplot(aes(x = reorder(State, maxIBU, FUN = median), y = maxIBU)) +
  geom_point(col = 'tomato2', size = 2) +
  geom_segment(aes(x = State,
                   xend = State,
                   y = min(maxIBU),
                   yend = max(maxIBU)),
               linetype = 'dashed',
               size = 0.1) +
  coord_flip() +
  ggtitle('Max IBU By State') +
  xlab('State') + ylab('Max IBU') +
  theme(plot.title = element_text(hjust = 0.5))
# Manipulate variable names to enable merging dataframes
lookup = data.frame(abb = state.abb, State = state.name)
colnames(max_data)[1] = "abb"
max_data2 = merge(max_data, lookup, by = "abb", all.x = TRUE, no.dups = F)
# Drop DC (leading space in the raw abbreviation; no entry in state.name)
max_data2 = max_data2[!max_data2$abb == ' DC',]
levels(max_data2$State) = levels(lookup$State)
levels(lookup$abb) = levels(max_data2$abb)
# Vectorized abbreviation -> state-name lookup (replaces a per-row loop);
# [] assignment keeps the column's original class
max_data2$State[] = state.name[match(trimws(as.character(max_data2$abb)), state.abb)]
max_data2$region <- tolower(max_data2$State)
states <- map_data("state")
# Merge dataframes
map.df <- merge(states, max_data2, by = "region", all.x = T)
map.df <- map.df[order(map.df$order),]
# ABV expressed as a percentage, for the map legend
map.df$Percent = map.df$maxABV * 100
# Heat map of max ABV per state.
# FIX: the legend says 'ABV (%)' but the raw proportion (maxABV) was being
# plotted; fill with the Percent column computed above so the scale matches
# its label (this mirrors the median-ABV map earlier in the file).
ggplot(map.df, aes(x=long,y=lat,group=group)) +
  geom_polygon(aes(fill=Percent)) +
  geom_path() +
  scale_fill_gradientn(colours=rev(rainbow(10)),na.value="grey90") +
  ggtitle("Max ABV in Each State") +
  coord_map() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = 'ABV (%)')
# Heat map of max IBU per state
ggplot(map.df, aes(x=long,y=lat,group=group)) +
  geom_polygon(aes(fill=maxIBU)) +
  geom_path() +
  scale_fill_gradientn(colours=rev(rainbow(10)),na.value="grey90") +
  ggtitle("Max IBU in Each State") +
  coord_map() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = 'IBU Rating')
Question 6
###6. Comment on the summary statistics and distribution of ABV
# Get summary statistics: five-number summary plus mean, then mean and
# standard deviation (NAs were already imputed in Question 3)
summary(dataset$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00100 0.05000 0.05647 0.05975 0.06700 0.12800
mean(dataset$ABV)
## [1] 0.05975184
sd(dataset$ABV)
## [1] 0.01345838
# Plot Overall ABV distribution colored by state.
# FIXES: use bare column names inside aes() and a formula in facet_wrap();
# df$col inside aes() bypasses ggplot's data masking, which breaks
# faceting/grouping.
dataset %>% ggplot(aes(fill = State)) + geom_histogram(aes(x = ABV), binwidth = 0.005)
# Keep only styles represented by more than 75 beers
popular_beers_tibble = count(dataset, Style)
popular_beers_tibble = popular_beers_tibble %>% filter(n > 75)
popular_beers = merge(dataset, popular_beers_tibble, by = 'Style', all.x = FALSE, all.y = TRUE)
# Faceted ABV distributions (as percentages) for the popular styles
popular_beers %>% ggplot() +
  geom_histogram(aes(x = ABV * 100, fill = Style)) +
  facet_wrap(~Style) + theme(legend.position = 'none') +
  xlab('ABV (%)')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Faceted IBU distributions for the popular styles
popular_beers %>% ggplot() + geom_histogram(aes(x = IBU, fill = Style)) +
  facet_wrap(~Style) +
  theme(legend.position = 'none') +
  xlab('IBU Rating')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question 7
# Scatter of IBU vs ABV with a linear fit
dataset %>% ggplot(aes(x = ABV * 100, y = IBU)) +
  geom_point(position = 'jitter') +
  stat_smooth(method = 'lm', se = FALSE) +
  ggtitle("Relationship Between IBU and ABV") +
  theme(legend.position = 'none', plot.title = element_text(hjust = 0.5)) +
  xlab('ABV (%)')
# Create datasets for IPA's and Ales, remove any IPA's that are in the ale
# dataset. Condense beers into 2 styles, IPA and Ale, then combine.
# ('.*X.*' in grepl() is just a substring test, so use fixed matching, and
# reference columns bare inside filter().)
ipa_dataset = dataset %>% filter(grepl('IPA', Style, fixed = TRUE))
ale_dataset = dataset %>% filter(grepl('Ale', Style, fixed = TRUE))
ale_dataset = ale_dataset %>% filter(!grepl('IPA', Style, fixed = TRUE))
ipa_dataset$Style = 'IPA'
ale_dataset$Style = 'Ale'
ipa_ale_dataset = rbind(ipa_dataset, ale_dataset)
# Same scatter, colored by the two condensed styles, with per-style fits
ipa_ale_dataset %>% ggplot(aes(x = ABV * 100, y = IBU, color = Style)) +
  geom_point(position = 'jitter') +
  ggtitle('Alcohol Content vs Bitterness') +
  scale_color_manual(values = c('red', 'blue')) +
  theme(plot.title = element_text(hjust = 0.5)) +
  stat_smooth(method = 'lm', se = FALSE) +
  xlab('ABV (%)')
# Filter into two groups: IPA and Ale (plain substring matching)
ipa_dataset = dataset %>% filter(grepl('IPA', Style, fixed = TRUE))
ale_dataset = dataset %>% filter(grepl('Ale', Style, fixed = TRUE))
ale_dataset = ale_dataset %>% filter(!grepl('IPA', Style, fixed = TRUE))
ipa_dataset$Style = 'IPA'
ale_dataset$Style = 'Ale'
ipa_ale_dataset = rbind(ipa_dataset, ale_dataset)
# Tune k for KNN: average test-set accuracy over 100 random 70/30 splits.
# (Removed a dead pre-loop train/test split and an unused 'accs' data frame;
# predictors selected by name instead of magic column positions 7 and 8.)
iterations = 100
numks = 30
masteracc = matrix(nrow = iterations, ncol = numks)
splitpercent = .70
predictors = c('ABV', 'IBU')
for(j in 1:iterations)
{
  trainIndices = sample(seq_len(nrow(ipa_ale_dataset)), round(splitpercent * nrow(ipa_ale_dataset)))
  train = ipa_ale_dataset[trainIndices,]
  test = ipa_ale_dataset[-trainIndices,]
  for (i in 1:numks)
  {
    classifications = knn(train[, predictors], test[, predictors], train$Style, prob = TRUE, k = i)
    tab = table(classifications, test$Style)
    CM = confusionMatrix(tab)
    masteracc[j,i] = CM$overall[1]
  }
}
# Mean accuracy per k across all random splits
MeanAcc = colMeans(masteracc)
plot(seq(1,numks,1), MeanAcc, type = 'l')
# Refit at the chosen k = 5 (reuses the train/test split from the final
# loop iteration). NOTE: table(test$Style, classifications) transposes the
# orientation used inside the loop, so Sensitivity/PPV swap roles.
classifications = knn(train[, predictors], test[, predictors], train$Style, prob = TRUE, k = 5)
CM = confusionMatrix(table(test$Style,classifications))
CM
## Confusion Matrix and Statistics
##
## classifications
## Ale IPA
## Ale 258 19
## IPA 25 158
##
## Accuracy : 0.9043
## 95% CI : (0.8737, 0.9296)
## No Information Rate : 0.6152
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7992
##
## Mcnemar's Test P-Value : 0.451
##
## Sensitivity : 0.9117
## Specificity : 0.8927
## Pos Pred Value : 0.9314
## Neg Pred Value : 0.8634
## Prevalence : 0.6152
## Detection Rate : 0.5609
## Detection Prevalence : 0.6022
## Balanced Accuracy : 0.9022
##
## 'Positive' Class : Ale
##
# Leave-one-out cross-validated accuracy for k = 1..90
accs = data.frame(accuracy = numeric(90), k = numeric(90))
for (i in 1:90)
{
  classifications = knn.cv(ipa_ale_dataset[, c('ABV', 'IBU')], ipa_ale_dataset$Style, prob = TRUE, k = i)
  CM = confusionMatrix(table(classifications, ipa_ale_dataset$Style))
  # FIX: 'CM$overal' only worked via $ partial matching; spell out 'overall'
  accs$accuracy[i] = CM$overall[1]
  accs$k[i] = i
}
plot(accs$k, accs$accuracy, type = 'l', xlab = 'k')
# Final leave-one-out model at k = 5
classifications = knn.cv(ipa_ale_dataset[, c('ABV', 'IBU')], ipa_ale_dataset$Style, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications, ipa_ale_dataset$Style))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 894 64
## IPA 69 507
##
## Accuracy : 0.9133
## 95% CI : (0.8981, 0.9269)
## No Information Rate : 0.6278
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8148
##
## Mcnemar's Test P-Value : 0.7287
##
## Sensitivity : 0.9283
## Specificity : 0.8879
## Pos Pred Value : 0.9332
## Neg Pred Value : 0.8802
## Prevalence : 0.6278
## Detection Rate : 0.5828
## Detection Prevalence : 0.6245
## Balanced Accuracy : 0.9081
##
## 'Positive' Class : Ale
##
# Assemble a per-beer results table: truth, prediction, and vote share
class_df = data.frame('classification' = classifications)
# FIX: knn.cv stores the winning-class vote share in the 'prob' attribute;
# fetch it by name instead of by position (attributes()[3] depends on
# attribute order) and strip names in one step instead of a loop.
probability = as.numeric(attr(classifications, 'prob'))
class_df = cbind(class_df, probability)
class_df = cbind(ipa_ale_dataset$Style, class_df, ipa_ale_dataset$ABV, ipa_ale_dataset$IBU)
names(class_df) = c('Truth', 'Classification', 'Probability', 'ABV', 'IBU')
class_df$Correct = ifelse(class_df$Truth == class_df$Classification, 'Correct Prediction', 'Incorrect Prediction')
class_df$Correct = ordered(class_df$Correct, levels = c("Incorrect Prediction", "Correct Prediction"))
# Numeric encoding of the true class (IPA = 1, Ale = 0)
class_df$Numeric = ifelse(class_df$Truth == 'IPA', 1, 0)
# Convert the winning-class vote share into P(Ale) for every row,
# regardless of which class was predicted
class_df$AleProb = ifelse(class_df$Classification == 'Ale', class_df$Probability, 1 - class_df$Probability)
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Express P(Ale) as a percentage for the 3-D plot's z axis
class_df$`Ale Probability` = class_df$AleProb * 100
# Interactive 3-D view of the classifier -- ABV x IBU x P(Ale), colored by
# whether the prediction was correct -- built as a single pipeline
model_plot <- plot_ly(class_df, x = ~ABV, y = ~IBU, z = ~`Ale Probability`, color = ~Correct, colors = 'Set1') %>%
  add_markers() %>%
  layout(title = 'Predicted Style By ABV and IBU',
         scene = list(xaxis = list(title = 'ABV'),
                      yaxis = list(title = 'IBU'),
                      zaxis = list(title = 'Probability of Being an Ale (%)')))
model_plot
# 2-D view of the same results: where in IBU/ABV space the model errs
ggplot(class_df, aes(x = IBU, y = ABV * 100, color = Correct)) +
  geom_point(position = 'jitter') +
  ggtitle('KNN Model Performance') +
  scale_color_manual('Model Performance', values = c('red', 'blue')) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab('ABV (%)')
library(ISLR)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Convert Style to a factor for modeling and make a fresh 70/30 split
ipa_ale_dataset$Style = as.factor(ipa_ale_dataset$Style)
splitpercent = .70
trainIndices = sample(1:dim(ipa_ale_dataset)[1], round(splitpercent * dim(ipa_ale_dataset)[1]))
ipa_ale_train = ipa_ale_dataset[trainIndices,]
ipa_ale_test = ipa_ale_dataset[-trainIndices,]
# Create dataframe of test set and model results
# NOTE(review): 'fitted.results' is not defined anywhere in this file -- the
# logistic-regression fitting code (presumably glm() on ipa_ale_train plus
# predict() on ipa_ale_test) appears to be missing, perhaps in a hidden
# (echo=FALSE) chunk of the source .Rmd. The same applies to 'Prob_IPA'
# used in the plot_ly call below. Confirm against the source document.
ipa_ale_test$Predicted_Style = fitted.results
ipa_ale_test$Correct = ifelse(ipa_ale_test$Style == ipa_ale_test$Predicted_Style, 'Correct Prediction', 'Incorrect Prediction')
# 2-D view of where the logistic model errs in IBU/ABV space
ipa_ale_test %>% ggplot(aes(x = IBU, y = ABV * 100, color = Correct)) +
geom_point(position = 'jitter') +
ggtitle('Logistic Model Performance') +
scale_color_manual('Model Performance', values = c('blue', 'red')) +
theme(plot.title = element_text(hjust = 0.5)) +
ylab('ABV (%)')
library(plotly)
# 3-D view: IBU x ABV x P(IPA), colored by correctness
logmodel_plot <- plot_ly(ipa_ale_test, x = ~IBU, y = ~ABV, z = ~Prob_IPA *100, color = ~Correct, colors = c('blue', 'red'))
logmodel_plot <- logmodel_plot %>% add_markers()
logmodel_plot <- logmodel_plot %>% layout(title = 'Logistic Regression Model Performance Probability', scene = list(xaxis = list(title = 'IBU'),
yaxis = list(title = 'ABV'),
zaxis = list(title = 'Probability of Being an IPA (%)')))
logmodel_plot
Question 9
# Filter into multiple groups
ipa_dataset = dataset %>% filter(grepl('.*IPA.*', dataset$Style))
# Wheat-style beers: wheat ales, Hefeweizen, Witbier
wheat_dataset = dataset %>% filter(grepl('.*Wheat.*', dataset$Style) | grepl('.*Hefeweizen.*', dataset$Style) | grepl('.*Witbier.*', dataset$Style))
# Generic ales: anything with ' Ale', minus IPAs, pale ales and wheat beers
ale_dataset = dataset %>% filter(grepl('.* Ale.*', dataset$Style))
ale_dataset = ale_dataset %>% filter(!grepl('.*IPA.*', ale_dataset$Style))
ale_dataset = ale_dataset %>% filter(!grepl('.* Pale.*', ale_dataset$Style))
# FIX: the pattern '.*Wheat*' made the final 't' optional ('t*' matches
# zero or more t's), so it actually tested for 'Whea'; the intended
# pattern is '.*Wheat.*'
ale_dataset = ale_dataset %>% filter(!grepl('.*Wheat.*', ale_dataset$Style))
pale_ale_dataset = dataset %>% filter(grepl('.* Pale.*', dataset$Style))
pale_ale_dataset = pale_ale_dataset %>% filter(!grepl('.*IPA.*', pale_ale_dataset$Style))
lager_dataset = dataset %>% filter(grepl('.*Lager.*', dataset$Style))
stout_dataset = dataset %>% filter(grepl('.*Stout.*', dataset$Style))
pilsner_dataset = dataset %>% filter(grepl('.*Pilsner.*', dataset$Style))
porter_data = dataset %>% filter(grepl('.*Porter.*', dataset$Style))
# Collapse each subset's Style to a single umbrella label
ipa_dataset$Style = c('IPA')
ale_dataset$Style = c('Ale')
pale_ale_dataset$Style = c('Pale Ale')
lager_dataset$Style = c('Lager')
stout_dataset$Style = c('Stout')
pilsner_dataset$Style = c('Pilsner')
wheat_dataset$Style = c('Wheat Beer')
porter_data$Style = c('Porter')
# Stack the buckets, sort by state, and factor the two key columns
styled_dataset = rbind(ipa_dataset, ale_dataset, pale_ale_dataset, lager_dataset, stout_dataset, pilsner_dataset, wheat_dataset, porter_data)
styled_dataset = styled_dataset[order(styled_dataset$State),]
styled_dataset$Style = as.factor(styled_dataset$Style)
styled_dataset$State = as.factor(styled_dataset$State)
# Find favorite beer in each state, and create a table containing the statistics.
beer_counts = styled_dataset %>% group_by(State) %>% count(Style)
# Sort by state, then by descending count within each state
beer_counts = beer_counts[with(beer_counts, order(beer_counts$State, -beer_counts$n)),]
beer_count_spread = spread(beer_counts, Style, n)
beer_count_spread[is.na(beer_count_spread)] = 0
# Save the data for dc before removing the DC observations. Necessary for heat map to work as written
beer_counts_dc = beer_counts
beer_counts = subset(beer_counts, beer_counts$State != ' DC')
# Row totals across the eight style columns in one vectorized call
# (replaces a per-row loop that triggered an 'uninitialised column'
# warning and carried a dead 'row = row + 1' increment)
beer_count_spread$Total = rowSums(beer_count_spread[, 2:9])
## Warning: Unknown or uninitialised column: 'Total'.
# Build a 50-row State/Style table holding each state's most-brewed style.
# NOTE(review): this relies on factor-level juggling and on favorites$State
# (taken from max_data2$abb) sharing beer_counts' state ordering -- verify;
# a group_by + slice(1) on beer_counts would be a more robust rewrite.
favorites = data.frame(matrix('', ncol = 2, nrow = 50))
names(favorites) = c('State', 'Style')
levels(favorites$State) = levels(styled_dataset$State)
levels(favorites$Style) = levels(styled_dataset$Style)
favorites$State = max_data2$abb
favorites$Style = c(NA)
favorites$Style = as.factor(favorites$Style)
faclevels = addNA(levels(beer_counts$Style))
levels(favorites$Style) = faclevels
row =1
faverow = 1
# beer_counts is sorted by State then descending count, so the first row
# encountered for each state carries that state's most common style
for (row in 1:nrow(beer_counts)) {
state = beer_counts$State[row]
style = beer_counts$Style[row]
if(is.na(favorites[favorites$State == state,]$Style)) {
favorites$Style[faverow] = style
faverow = faverow + 1
}
# NOTE(review): this increment is dead code -- 'row' is reset by the for loop
row = row + 1
}
levels(favorites$Style) = levels(styled_dataset$Style)
levels(favorites$Style) = levels(styled_dataset$Style)
# Create Heatmap of favorite beers by State
library(ggplot2)
library(maps)
library(dplyr)
library(mapproj)
# Create a dataframe containing state information and manipulate variables to merge dataframes later
lookup = data.frame(abb = state.abb, State = state.name)
colnames(favorites)[1] = "abb"
favorites2 = merge(favorites, lookup, by = "abb", all.x = TRUE, no.dups = F)
levels(favorites2$State) = levels(lookup$State)
levels(lookup$abb) = levels(favorites2$abb)
# Vectorized abbreviation -> full-state-name lookup (replaces a per-row
# loop); [] assignment keeps the column's original class
favorites2$State[] = state.name[match(trimws(as.character(favorites2$abb)), state.abb)]
favorites2$region <- tolower(favorites2$State)
states <- map_data("state")
map.df <- merge(states, favorites2, by = "region", all.x = T)
map.df <- map.df[order(map.df$order),]
map.df$region <- as.factor(map.df$region)
# DC has no row in the lookup table, so its Style is NA after the merge;
# hard-code its favorite in one vectorized assignment (replaces a row loop
# that also carried a dead 'row = row + 1' increment)
map.df$Style[map.df$region == 'district of columbia'] = 'Ale'
# Plot the heatmap of each state's most-brewed style; drop = FALSE keeps
# every style in the legend even if no state favors it
style_palette = c('red2', 'orange2', 'chartreuse3', 'blue3', 'cyan1', 'orchid', 'springgreen2', 'turquoise3')
ggplot(map.df, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Style)) +
  geom_path() +
  ggtitle('Most Brewed Beer in Each State') +
  coord_map() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = style_palette, drop = FALSE)
Conclusion